library(readr)
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.2.2
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0 ✔ dplyr 1.0.10
## ✔ tibble 3.1.8 ✔ stringr 1.4.0
## ✔ tidyr 1.2.0 ✔ forcats 0.5.2
## ✔ purrr 1.0.1
## Warning: package 'ggplot2' was built under R version 4.2.2
## Warning: package 'purrr' was built under R version 4.2.3
## Warning: package 'dplyr' was built under R version 4.2.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(plotly)
## Warning: package 'plotly' was built under R version 4.2.3
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
library(ggthemes)
## Warning: package 'ggthemes' was built under R version 4.2.3
library(DT)
## Warning: package 'DT' was built under R version 4.2.3
library(scales)
## Warning: package 'scales' was built under R version 4.2.3
##
## Attaching package: 'scales'
##
## The following object is masked from 'package:purrr':
##
## discard
##
## The following object is masked from 'package:readr':
##
## col_factor
library(openair)
## Warning: package 'openair' was built under R version 4.2.3
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.2.2
## corrplot 0.92 loaded
library(caTools)
## Warning: package 'caTools' was built under R version 4.2.2
library(caret)
## Warning: package 'caret' was built under R version 4.2.2
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
confirmed <- read_csv("C:/Users/SOUMIK/Desktop/DV J-component/time_series_covid_19_confirmed.csv",col_types = cols(.default = col_double(), `Province/State` = col_character(), `Country/Region` = col_character()))
recovered <- read_csv(file = "C:/Users/SOUMIK/Desktop/DV J-component/time_series_covid_19_recovered.csv",col_types = cols(.default = col_double(),`Province/State` = col_character(), `Country/Region` = col_character()))
deaths <- read_csv(file = "C:/Users/SOUMIK/Desktop/DV J-component/time_series_covid_19_deaths.csv",col_types = cols(.default = col_double(),`Province/State` = col_character(), `Country/Region` = col_character()))
codes <- read_csv('C:/Users/SOUMIK/Desktop/DV J-component/2014_world_gdp_with_codes.csv',col_types = cols(COUNTRY = col_character(),`GDP (BILLIONS)` = col_double(),CODE = col_character()))
data <- read.csv("C:/Users/SOUMIK/Desktop/DV J-component/Conditions_Contributing_to_COVID-19_Deaths__by_State_and_Age__Provisional_2020-2022.csv")
df <- read.csv("C:/Users/SOUMIK/Desktop/DV J-component/covid_19_india.csv")
dim(confirmed)
## [1] 276 498
dim(recovered)
## [1] 261 498
dim(deaths)
## [1] 276 498
dim(data)
## [1] 385020 14
dim(df)
## [1] 18110 9
head(confirmed)
## # A tibble: 6 × 498
## Provinc…¹ Count…² Lat Long 1/22/…³ 1/23/…⁴ 1/24/…⁵ 1/25/…⁶ 1/26/…⁷ 1/27/…⁸
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 <NA> Afghan… 33.9 67.7 0 0 0 0 0 0
## 2 <NA> Albania 41.2 20.2 0 0 0 0 0 0
## 3 <NA> Algeria 28.0 1.66 0 0 0 0 0 0
## 4 <NA> Andorra 42.5 1.52 0 0 0 0 0 0
## 5 <NA> Angola -11.2 17.9 0 0 0 0 0 0
## 6 <NA> Antigu… 17.1 -61.8 0 0 0 0 0 0
## # … with 488 more variables: `1/28/20` <dbl>, `1/29/20` <dbl>, `1/30/20` <dbl>,
## # `1/31/20` <dbl>, `2/1/20` <dbl>, `2/2/20` <dbl>, `2/3/20` <dbl>,
## # `2/4/20` <dbl>, `2/5/20` <dbl>, `2/6/20` <dbl>, `2/7/20` <dbl>,
## # `2/8/20` <dbl>, `2/9/20` <dbl>, `2/10/20` <dbl>, `2/11/20` <dbl>,
## # `2/12/20` <dbl>, `2/13/20` <dbl>, `2/14/20` <dbl>, `2/15/20` <dbl>,
## # `2/16/20` <dbl>, `2/17/20` <dbl>, `2/18/20` <dbl>, `2/19/20` <dbl>,
## # `2/20/20` <dbl>, `2/21/20` <dbl>, `2/22/20` <dbl>, `2/23/20` <dbl>, …
## # ℹ Use `colnames()` to see all variable names
head(recovered)
## # A tibble: 6 × 498
## Provinc…¹ Count…² Lat Long 1/22/…³ 1/23/…⁴ 1/24/…⁵ 1/25/…⁶ 1/26/…⁷ 1/27/…⁸
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 <NA> Afghan… 33.9 67.7 0 0 0 0 0 0
## 2 <NA> Albania 41.2 20.2 0 0 0 0 0 0
## 3 <NA> Algeria 28.0 1.66 0 0 0 0 0 0
## 4 <NA> Andorra 42.5 1.52 0 0 0 0 0 0
## 5 <NA> Angola -11.2 17.9 0 0 0 0 0 0
## 6 <NA> Antigu… 17.1 -61.8 0 0 0 0 0 0
## # … with 488 more variables: `1/28/20` <dbl>, `1/29/20` <dbl>, `1/30/20` <dbl>,
## # `1/31/20` <dbl>, `2/1/20` <dbl>, `2/2/20` <dbl>, `2/3/20` <dbl>,
## # `2/4/20` <dbl>, `2/5/20` <dbl>, `2/6/20` <dbl>, `2/7/20` <dbl>,
## # `2/8/20` <dbl>, `2/9/20` <dbl>, `2/10/20` <dbl>, `2/11/20` <dbl>,
## # `2/12/20` <dbl>, `2/13/20` <dbl>, `2/14/20` <dbl>, `2/15/20` <dbl>,
## # `2/16/20` <dbl>, `2/17/20` <dbl>, `2/18/20` <dbl>, `2/19/20` <dbl>,
## # `2/20/20` <dbl>, `2/21/20` <dbl>, `2/22/20` <dbl>, `2/23/20` <dbl>, …
## # ℹ Use `colnames()` to see all variable names
head(deaths)
## # A tibble: 6 × 498
## Provinc…¹ Count…² Lat Long 1/22/…³ 1/23/…⁴ 1/24/…⁵ 1/25/…⁶ 1/26/…⁷ 1/27/…⁸
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 <NA> Afghan… 33.9 67.7 0 0 0 0 0 0
## 2 <NA> Albania 41.2 20.2 0 0 0 0 0 0
## 3 <NA> Algeria 28.0 1.66 0 0 0 0 0 0
## 4 <NA> Andorra 42.5 1.52 0 0 0 0 0 0
## 5 <NA> Angola -11.2 17.9 0 0 0 0 0 0
## 6 <NA> Antigu… 17.1 -61.8 0 0 0 0 0 0
## # … with 488 more variables: `1/28/20` <dbl>, `1/29/20` <dbl>, `1/30/20` <dbl>,
## # `1/31/20` <dbl>, `2/1/20` <dbl>, `2/2/20` <dbl>, `2/3/20` <dbl>,
## # `2/4/20` <dbl>, `2/5/20` <dbl>, `2/6/20` <dbl>, `2/7/20` <dbl>,
## # `2/8/20` <dbl>, `2/9/20` <dbl>, `2/10/20` <dbl>, `2/11/20` <dbl>,
## # `2/12/20` <dbl>, `2/13/20` <dbl>, `2/14/20` <dbl>, `2/15/20` <dbl>,
## # `2/16/20` <dbl>, `2/17/20` <dbl>, `2/18/20` <dbl>, `2/19/20` <dbl>,
## # `2/20/20` <dbl>, `2/21/20` <dbl>, `2/22/20` <dbl>, `2/23/20` <dbl>, …
## # ℹ Use `colnames()` to see all variable names
head(data)
## Data.As.Of Start.Date End.Date Group Year Month State
## 1 03/27/2022 01/01/2020 03/26/2022 By Total NA NA United States
## 2 03/27/2022 01/01/2020 03/26/2022 By Total NA NA United States
## 3 03/27/2022 01/01/2020 03/26/2022 By Total NA NA United States
## 4 03/27/2022 01/01/2020 03/26/2022 By Total NA NA United States
## 5 03/27/2022 01/01/2020 03/26/2022 By Total NA NA United States
## 6 03/27/2022 01/01/2020 03/26/2022 By Total NA NA United States
## Condition.Group Condition ICD10_codes Age.Group
## 1 Respiratory diseases Influenza and pneumonia J09-J18 0-24
## 2 Respiratory diseases Influenza and pneumonia J09-J18 25-34
## 3 Respiratory diseases Influenza and pneumonia J09-J18 35-44
## 4 Respiratory diseases Influenza and pneumonia J09-J18 45-54
## 5 Respiratory diseases Influenza and pneumonia J09-J18 55-64
## 6 Respiratory diseases Influenza and pneumonia J09-J18 65-74
## COVID.19.Deaths Number.of.Mentions Flag
## 1 1284 1341
## 2 5390 5591
## 3 14166 14726
## 4 35320 36658
## 5 77221 79932
## 6 118051 121479
head(df)
## Sno Date Time State.UnionTerritory ConfirmedIndianNational
## 1 1 2020-01-30 6:00 PM Kerala 1
## 2 2 2020-01-31 6:00 PM Kerala 1
## 3 3 2020-02-01 6:00 PM Kerala 2
## 4 4 2020-02-02 6:00 PM Kerala 3
## 5 5 2020-02-03 6:00 PM Kerala 3
## 6 6 2020-02-04 6:00 PM Kerala 3
## ConfirmedForeignNational Cured Deaths Confirmed
## 1 0 0 0 1
## 2 0 0 0 1
## 3 0 0 0 2
## 4 0 0 0 3
## 5 0 0 0 3
## 6 0 0 0 3
colnames(confirmed)[2]="Country"
colnames(recovered)[2]="Country"
colnames(deaths)[2]="Country"
confirmed <- confirmed %>%
gather("Date", "Confirmed", -c("Province/State", "Country", "Lat", "Long")) %>%
mutate(Date = as.Date(Date, "%m/%d/%y"))
recovered <- recovered %>%
gather("Date", "Recovered", -c("Province/State", "Country", "Lat", "Long")) %>%
mutate(Date = as.Date(Date, "%m/%d/%y"))
deaths <- deaths %>%
gather("Date", "Deaths", -c("Province/State", "Country", "Lat", "Long")) %>%
mutate(Date = as.Date(Date, "%m/%d/%y"))
head(confirmed)
## # A tibble: 6 × 6
## `Province/State` Country Lat Long Date Confirmed
## <chr> <chr> <dbl> <dbl> <date> <dbl>
## 1 <NA> Afghanistan 33.9 67.7 2020-01-22 0
## 2 <NA> Albania 41.2 20.2 2020-01-22 0
## 3 <NA> Algeria 28.0 1.66 2020-01-22 0
## 4 <NA> Andorra 42.5 1.52 2020-01-22 0
## 5 <NA> Angola -11.2 17.9 2020-01-22 0
## 6 <NA> Antigua and Barbuda 17.1 -61.8 2020-01-22 0
head(recovered)
## # A tibble: 6 × 6
## `Province/State` Country Lat Long Date Recovered
## <chr> <chr> <dbl> <dbl> <date> <dbl>
## 1 <NA> Afghanistan 33.9 67.7 2020-01-22 0
## 2 <NA> Albania 41.2 20.2 2020-01-22 0
## 3 <NA> Algeria 28.0 1.66 2020-01-22 0
## 4 <NA> Andorra 42.5 1.52 2020-01-22 0
## 5 <NA> Angola -11.2 17.9 2020-01-22 0
## 6 <NA> Antigua and Barbuda 17.1 -61.8 2020-01-22 0
head(deaths)
## # A tibble: 6 × 6
## `Province/State` Country Lat Long Date Deaths
## <chr> <chr> <dbl> <dbl> <date> <dbl>
## 1 <NA> Afghanistan 33.9 67.7 2020-01-22 0
## 2 <NA> Albania 41.2 20.2 2020-01-22 0
## 3 <NA> Algeria 28.0 1.66 2020-01-22 0
## 4 <NA> Andorra 42.5 1.52 2020-01-22 0
## 5 <NA> Angola -11.2 17.9 2020-01-22 0
## 6 <NA> Antigua and Barbuda 17.1 -61.8 2020-01-22 0
#Merging confirmed, recovered and deaths dataset into one dataset
ts_total <- confirmed %>%
left_join(deaths) %>%
left_join(recovered) %>%
mutate(Recovered = replace_na(Recovered, replace = 0))
## Joining, by = c("Province/State", "Country", "Lat", "Long", "Date")
## Joining, by = c("Province/State", "Country", "Lat", "Long", "Date")
head(ts_total)
## # A tibble: 6 × 8
## `Province/State` Country Lat Long Date Confi…¹ Deaths Recov…²
## <chr> <chr> <dbl> <dbl> <date> <dbl> <dbl> <dbl>
## 1 <NA> Afghanistan 33.9 67.7 2020-01-22 0 0 0
## 2 <NA> Albania 41.2 20.2 2020-01-22 0 0 0
## 3 <NA> Algeria 28.0 1.66 2020-01-22 0 0 0
## 4 <NA> Andorra 42.5 1.52 2020-01-22 0 0 0
## 5 <NA> Angola -11.2 17.9 2020-01-22 0 0 0
## 6 <NA> Antigua and B… 17.1 -61.8 2020-01-22 0 0 0
## # … with abbreviated variable names ¹Confirmed, ²Recovered
dim(ts_total)
## [1] 136344 8
#We all know "Diamond Princess" and "MS Zaandam" are cruises, So we have to remove them from the data
ts_total <- ts_total %>%
filter(Country != "Diamond Princess") %>%
filter(Country != "MS Zaandam")
ts_total$Deaths[is.na(ts_total$Deaths)] <- 0
#Grouping by country, date and incorporating new cases
cases_latest <- ts_total %>%
group_by(Country, Date) %>%
summarise(Confirmed = sum(Confirmed),Recovered = sum(Recovered),Deaths = sum(Deaths)) %>%
mutate("New Cases" = Confirmed - lag(Confirmed, 1) ) %>%
filter(Date == max(Date))
## `summarise()` has grouped output by 'Country'. You can override using the
## `.groups` argument.
day_latest <- max(cases_latest$Date)
head(cases_latest)
## # A tibble: 6 × 6
## # Groups: Country [6]
## Country Date Confirmed Recovered Deaths `New Cases`
## <chr> <date> <dbl> <dbl> <dbl> <dbl>
## 1 Afghanistan 2021-05-29 70111 57281 2899 981
## 2 Albania 2021-05-29 132297 129215 2449 12
## 3 Algeria 2021-05-29 128456 89419 3460 258
## 4 Andorra 2021-05-29 13693 13416 127 0
## 5 Angola 2021-05-29 34180 27646 757 236
## 6 Antigua and Barbuda 2021-05-29 1259 1206 42 0
#Created a dataset including total news of COVID-19
cases_total_date <- ts_total %>%
rename(Region = Country) %>%
group_by(Date) %>%
summarise(Confirmed = sum(Confirmed),
Deaths = sum(Deaths),
Recovered = sum(Recovered)) %>%
mutate("New_Cases" = Confirmed - lag(Confirmed, 1))
cases_total_date$New_Cases[is.na(cases_total_date$New_Cases)] <- 0
cases_total_latest <- cases_total_date %>%
filter(Date == max(Date))
head(cases_total_latest)
## # A tibble: 1 × 5
## Date Confirmed Deaths Recovered New_Cases
## <date> <dbl> <dbl> <dbl> <dbl>
## 1 2021-05-29 169950839 3533604 105719754 480835
head(cases_total_date)
## # A tibble: 6 × 5
## Date Confirmed Deaths Recovered New_Cases
## <date> <dbl> <dbl> <dbl> <dbl>
## 1 2020-01-22 557 17 30 0
## 2 2020-01-23 655 18 32 98
## 3 2020-01-24 941 26 39 286
## 4 2020-01-25 1433 42 42 492
## 5 2020-01-26 2118 56 56 685
## 6 2020-01-27 2927 82 65 809
codes <- codes %>%
select(COUNTRY, CODE) %>%
rename(Region = COUNTRY ,Code = CODE) %>%
rownames_to_column("id")
head(codes)
## # A tibble: 6 × 3
## id Region Code
## <chr> <chr> <chr>
## 1 1 Afghanistan AFG
## 2 2 Albania ALB
## 3 3 Algeria DZA
## 4 4 American Samoa ASM
## 5 5 Andorra AND
## 6 6 Angola AGO
codes$id <- as.integer(codes$id)
head(codes)
## # A tibble: 6 × 3
## id Region Code
## <int> <chr> <chr>
## 1 1 Afghanistan AFG
## 2 2 Albania ALB
## 3 3 Algeria DZA
## 4 4 American Samoa ASM
## 5 5 Andorra AND
## 6 6 Angola AGO
codes$Region <- codes$Region %>%
str_replace(pattern = "United States", replacement = "US")
#adding country codes with cases_latest dataset
cases_latest_codes <- cases_latest %>%
left_join(codes, by = c("Country" = "Region" )) %>%
arrange(desc(Confirmed))
head(cases_latest_codes)
## # A tibble: 6 × 8
## # Groups: Country [6]
## Country Date Confirmed Recovered Deaths `New Cases` id Code
## <chr> <date> <dbl> <dbl> <dbl> <dbl> <int> <chr>
## 1 US 2021-05-29 33251939 0 594306 11976 212 USA
## 2 India 2021-05-29 27894800 25454320 325972 165553 93 IND
## 3 Brazil 2021-05-29 16471600 14496224 461057 79670 28 BRA
## 4 France 2021-05-29 5719877 390878 109518 11527 71 FRA
## 5 Turkey 2021-05-29 5235978 5094279 47271 7656 205 TUR
## 6 Russia 2021-05-29 4995613 4616422 118781 9155 165 RUS
cases_latest_codes %>%
select(Country,Code, Date, Confirmed, `New Cases`, Recovered, Deaths) %>%
arrange(desc(Confirmed)) %>%
datatable(rownames = FALSE,options = list())
cases_all <- cases_total_date %>%
select(-Confirmed, -New_Cases) %>%
gather("Status", "Cases", -"Date")
barchart <- ggplot(data = cases_total_date, aes(x = Date)) +
geom_bar(aes(y = Confirmed), position = "stack", stat = "identity", fill = "#ff5050") +
geom_bar(data = cases_all, aes(y = Cases, fill = Status), position = "stack", stat = "identity") +
scale_fill_manual(values = c("#000000", "#009900")) +
scale_y_continuous(breaks = seq(0, 21000000, by = 1000000), labels = comma) +
theme(panel.background = element_rect(fill = "White"),
legend.position = "bottom",
axis.title = element_blank(),
axis.text.y = element_blank(),
axis.ticks = element_blank()) +
ggtitle("World COVID-19 Total Cases by Day")
ggplotly(barchart)
cases_total_date <- cases_total_date %>%
group_by(Date, Confirmed) %>%
mutate(Mortality_rate = Deaths / Confirmed, Recovery_rate = Recovered / Confirmed) %>%ungroup()
barchart_1 <-cases_total_date %>%
select(Date, Mortality_rate, Recovery_rate) %>%
gather(status.ratio, ratio, -Date ) %>%
ggplot(aes(x = Date, y = ratio, fill = status.ratio)) +
geom_bar(stat = "identity", position = "dodge") +
theme(panel.background = element_rect(fill = "White"),
legend.position = "bottom",
axis.title = element_blank(),
axis.text.y = element_blank(),
axis.ticks = element_blank()) +
ggtitle("The Mortality_rate and the Recovery_rate")
ggplotly(barchart_1) %>%
layout(legend = list(orientation = 'h'))
conf<- cases_total_date[,"Confirmed"]
date <- seq(from=as.Date('2020-01-22'),by=1,to = as.Date('2021-05-29'))
calendarPlot(data.frame(conf, date), pollutant = 'Confirmed', year = 2020, main = "Confirmed Cases")
rec <- cases_total_date[,"Recovered"]
calendarPlot(data.frame(rec, date), pollutant = 'Recovered', year = 2020, main = "Recovered Cases", cols = "PiYG")
det <- cases_total_date[,"Deaths"]
calendarPlot(data.frame(det, date), pollutant = 'Deaths', year = 2020, main = "Deaths", cols = "RdGy")
New_Cases <- cases_total_date[,"New_Cases"]
calendarPlot(data.frame(New_Cases, date), pollutant = 'New_Cases', year = 2020, main = "New Cases", cols = "BrBG")
cases_total_date %>%
select(-Date) %>%
na.omit() %>%
cor(use = "pairwise.complete.obs") %>%
corrplot.mixed()
top_10_confirmed <- cases_latest %>%
select(Country, Confirmed) %>%
arrange(desc(Confirmed))
top_10_confirmed[1:10,] %>%
ggplot(aes(x = reorder(Country,Confirmed), y = Confirmed )) +
geom_bar(stat = "identity", fill = "red", width = 0.8) +
scale_y_continuous(labels = scales::comma) +
labs(x = "", y = "", title = "Top 10 (the Most Confirmed Cases)") +
theme(axis.text.x = element_text(angle = 45)) +
theme(axis.title = element_text(size = 14, colour = "black"),
axis.text.y = element_text(size = 11, face = "bold"))
top_10_Deaths <- cases_latest %>%
select(Country, Deaths) %>%
arrange(desc(Deaths))
top_10_Deaths[1:10,] %>%
ggplot(aes(x = reorder(Country,Deaths), y = Deaths )) +
geom_bar(stat = "identity", fill = "blue", width = 0.8) +
scale_y_continuous(labels = scales::comma) +
labs(x = "", y = "", title = "Top 10 (the Most Deaths)") +
theme(axis.text.x = element_text(angle = 45)) +
theme(axis.title = element_text(size = 14, colour = "black"),
axis.text.y = element_text(size = 11, face = "bold"))
top_10_Recovered <- cases_latest %>%
select(Country, Recovered) %>%
arrange(desc(Recovered))
top_10_Recovered[1:10,] %>%
ggplot(aes(x = reorder(Country,Recovered), y = Recovered )) +
geom_bar(stat = "identity", fill = "green", width = 0.8) +
scale_y_continuous(labels = scales::comma) +
labs(x = "", y = "", title = "Top 10 (the Most Recovered)") +
theme(axis.text.x = element_text(angle = 45)) +
theme(axis.title = element_text(size = 14, colour = "black"),
axis.text.y = element_text(size = 11, face = "bold"))
head(cases_latest)
## # A tibble: 6 × 6
## # Groups: Country [6]
## Country Date Confirmed Recovered Deaths `New Cases`
## <chr> <date> <dbl> <dbl> <dbl> <dbl>
## 1 Afghanistan 2021-05-29 70111 57281 2899 981
## 2 Albania 2021-05-29 132297 129215 2449 12
## 3 Algeria 2021-05-29 128456 89419 3460 258
## 4 Andorra 2021-05-29 13693 13416 127 0
## 5 Angola 2021-05-29 34180 27646 757 236
## 6 Antigua and Barbuda 2021-05-29 1259 1206 42 0
options(repr.plot.width = 40, repr.plot.height = 9)
cases_latest %>%
select(Country, Confirmed, Recovered, Deaths) %>%
gather(key = group_var, value = "Cases", -Country, na.rm = TRUE) %>%
group_by(Country, group_var) %>%
summarise(n = sum(Cases), .groups = "drop_last") %>%
arrange(desc(n)) %>%
group_by(group_var) %>%
slice(1:5)%>%
ggplot(aes(x = Country, y = n, fill=Country)) +
geom_bar(stat = "identity") +
facet_grid(~ group_var, scales = "free") +
scale_y_continuous(labels = scales::comma) +
geom_label(aes(label=round(n/1000000, 1)), size=2, fill="white") +
labs(title = "Top Countries per Case Type", subtitle = "Numbers in Millions") + theme(
axis.text.x = element_text(angle = 30, vjust = 1, hjust = 1))
top_6_affected <- ts_total %>%
select(Country, Date, Confirmed, Deaths, Recovered) %>%
filter(Country %in% c("US", "Spain", "Italy", "United Kingdom",
"Russia","France")) %>%
group_by(Country, Date) %>%
summarise(Confirmed = sum(Confirmed),
Recovered = sum(Recovered),
Deaths = sum(Deaths)) %>%
mutate("New_Cases" = Confirmed - lag(Confirmed, 1),
"Recovery_Rate" = Recovered / Confirmed ,
"Mortality_Rate" = Deaths / Confirmed) %>%
mutate("New_Cases" = round(New_Cases, 3),
"Recovery_Rate" = round(Recovery_Rate, 3),
"Mortality_Rate" = round(Mortality_Rate, 3))
## `summarise()` has grouped output by 'Country'. You can override using the
## `.groups` argument.
top_6_affected$New_Cases[is.na(top_6_affected$New_Cases)] <- 0
top_6_affected$Recovery_Rate[is.nan(top_6_affected$Recovery_Rate)] <- 0
top_6_affected$Mortality_Rate[is.nan(top_6_affected$Mortality_Rate)] <- 0
top_6_affected %>%
ggplot(aes(x = Date, y = Mortality_Rate, fill = Country)) +
geom_bar(stat = "identity",alpha = 0.8) +
facet_wrap(~ Country) +
labs(x = "", y = "Mortality Rate") +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
legend.position = "none",
axis.title.y = element_text(face = "bold", size = 10))
data %>%
group_by(Condition.Group) %>%
summarise(covid_deaths = sum(COVID.19.Deaths, na.rm = TRUE)) %>%
mutate(relative_frequency = covid_deaths/sum(covid_deaths)) %>%
arrange(desc(covid_deaths))
## # A tibble: 12 × 3
## Condition.Group covid…¹ relat…²
## <chr> <int> <dbl>
## 1 Respiratory diseases 1.33e7 0.300
## 2 COVID-19 1.17e7 0.265
## 3 Circulatory diseases 7.82e6 0.176
## 4 All other conditions and causes (residual) 4.72e6 0.107
## 5 Diabetes 1.74e6 0.0393
## 6 Renal failure 1.26e6 0.0283
## 7 Sepsis 1.15e6 0.0259
## 8 Vascular and unspecified dementia 8.90e5 0.0201
## 9 Malignant neoplasms 5.90e5 0.0133
## 10 Obesity 5.64e5 0.0127
## 11 Alzheimer disease 3.25e5 0.00732
## 12 Intentional and unintentional injury, poisoning, and other a… 2.56e5 0.00578
## # … with abbreviated variable names ¹covid_deaths, ²relative_frequency
data %>%
group_by(Age.Group) %>%
summarise(covid_deaths = sum(COVID.19.Deaths, na.rm = TRUE)) %>%
mutate(relative_frequency = covid_deaths/sum(covid_deaths)) %>%
filter(Age.Group != 'All Ages' & Age.Group != 'Not stated' )%>%
arrange(desc(covid_deaths))
## # A tibble: 8 × 3
## Age.Group covid_deaths relative_frequency
## <chr> <int> <dbl>
## 1 75-84 5801833 0.131
## 2 85+ 5431930 0.123
## 3 65-74 5244416 0.118
## 4 55-64 3286716 0.0742
## 5 45-54 1434452 0.0324
## 6 35-44 556273 0.0126
## 7 25-34 202749 0.00457
## 8 0-24 50935 0.00115
comorb_data <- data %>%
filter(Age.Group != 'All Ages' & Age.Group != 'Not stated' ) %>%
filter(Condition.Group != 'COVID-19') %>%
drop_na(COVID.19.Deaths) %>%
select(-c(Flag, Number.of.Mentions, Condition.Group ,ICD10_codes, Data.As.Of, Year, Month, State, Group)) %>%
rename(Age_Group = Age.Group, Covid_Deaths = COVID.19.Deaths, Start_Date = Start.Date, End_Date = End.Date) %>%
arrange(desc(Covid_Deaths))
final_comorb <- comorb_data %>%
group_by(Age_Group, Condition) %>%
summarise(Covid_Deaths = sum(Covid_Deaths, na.rm = T)) %>%
mutate(Relative_Frequency = Covid_Deaths/sum(Covid_Deaths)) %>%
arrange(desc(Covid_Deaths))
## `summarise()` has grouped output by 'Age_Group'. You can override using the
## `.groups` argument.
head(final_comorb)
## # A tibble: 6 × 4
## # Groups: Age_Group [3]
## Age_Group Condition Covid_Deaths Relative_F…¹
## <chr> <chr> <int> <dbl>
## 1 75-84 Influenza and pneumonia 737851 0.172
## 2 65-74 Influenza and pneumonia 708618 0.182
## 3 85+ Influenza and pneumonia 616583 0.157
## 4 75-84 Respiratory failure 614542 0.143
## 5 75-84 All other conditions and causes (residual) 613078 0.143
## 6 65-74 Respiratory failure 578116 0.148
## # … with abbreviated variable name ¹Relative_Frequency
options(repr.plot.width=15, repr.plot.height = 10)
final_comorb %>%
ggplot(mapping = aes(x = Age_Group, y = Condition))+
geom_tile(mapping = aes(fill = Covid_Deaths))
datatable <- df %>%filter(Date == max(Date))
datatable %>%
select(State.UnionTerritory, Date, Confirmed, Cured, Deaths) %>%
arrange(desc(Confirmed)) %>%
datatable(rownames = FALSE,options = list())
Top_6 <-
df %>%
dplyr::filter(Date == max(Date)) %>%
arrange(desc(Confirmed))
Top_6 <- Top_6[1:6,]
percent <- paste(Top_6$State.UnionTerritory ,round(100*Top_6$Confirmed / sum(Top_6$Confirmed), 2), "%")
Top_6 %>%
ggplot(aes(x = "", y = Confirmed, fill = percent) ) +
geom_bar(stat = "identity", width = 1) +
coord_polar(theta = "y") +
scale_fill_brewer(palette = "Pastel1") +
theme(panel.grid = element_blank(), panel.background = element_blank(),
plot.title = element_text(hjust = 0.5),
axis.text.x = element_blank()) +
theme_economist() +
labs(x="", y = "", title = "Confirmed Cases Ratio", fill = "")
Top_6_all <- df %>%
filter(State.UnionTerritory == Top_6$State.UnionTerritory)
## Warning in State.UnionTerritory == Top_6$State.UnionTerritory: longer object
## length is not a multiple of shorter object length
Top_6_all %>%
select(State.UnionTerritory, Date, Confirmed, Cured, Deaths) %>%
gather(status, cases, -c("State.UnionTerritory", "Date")) %>%
ggplot()+
geom_line(aes(Date, cases, group = status, color = status), lwd = 1) +
labs(title="Confirmed/Recovered cases over time by top 6 State/Union Territory",color = "") +
facet_wrap(~ State.UnionTerritory, scales = "free")+ theme_fivethirtyeight()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
model <- lm(Deaths ~ Confirmed + Recovered + `New Cases`, data=cases_latest)
test <- summary(model)
R_sq <- test$r.squared
sprintf("The R-squared value for our model is: %f",R_sq)
## [1] "The R-squared value for our model is: 0.891188"
adj_R_sq <- test$adj.r.squared
sprintf("The adjusted R-squared value for our model is: %f",adj_R_sq)
## [1] "The adjusted R-squared value for our model is: 0.889443"
INFERENCE: The obtained adjusted R_squared value for our model is
0.889553. Therefore, our model is able to explain 88.95% of the
variance.
fitControl <- trainControl(method = "repeatedcv",
number = 10, # number of folds
repeats = 10) # repeated ten times
model.cv <- train(Deaths ~ Confirmed + Recovered + `New Cases`, data = cases_latest, method = "lasso", trControl = fitControl, preProcess = c('scale', 'center'))
model.cv
## The lasso
##
## 191 samples
## 3 predictor
##
## Pre-processing: scaled (3), centered (3)
## Resampling: Cross-Validated (10 fold, repeated 10 times)
## Summary of sample sizes: 171, 171, 172, 173, 171, 172, ...
## Resampling results across tuning parameters:
##
## fraction RMSE Rsquared MAE
## 0.1 42034.81 0.9093862 22382.456
## 0.5 17624.62 0.9036615 8481.608
## 0.9 29150.02 0.8699733 10161.335
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was fraction = 0.5.
INFERENCE: We have used number of folds as 10 and number of
repetitions as 10. The data has been preprocessed (scaling and centre)
before feeding into the model. The final R-squared value obtained was
0.9097 ie 90.97% with the optimum fractions being 0.5.